# Python 3.8

#############################
### Packages
#############################
import pandas as pd

#############################
### Output Filename 
#############################

Ecology_Edgelist_Filename = "/Path/Data/OUTPUT_Python_MAG_Ecology_Citations_GRID_Edgelist.csv.gz"

###############################
### Queries | Run on AWS Web Portal and Download CSVs and Rename
###############################

# INPUT_Global_Ecology_of_Citations_GRID_Edgelist

ecology_edgelist_query = '''
SELECT a.fieldofstudyid as Field,
       a.childfieldofstudyid as SubField, 
       f.country as Country,
       f.grid as GRID,
       c.full_year as Year,
       e.full_year as Year_That_Is_Citing,
       count(*) as Citations_Received

FROM (SELECT fieldofstudyid, 
             childfieldofstudyid
      FROM "mag_data"."fieldofstudychildren"
      WHERE fieldofstudyid = 121332964 OR /*Physics*/
            fieldofstudyid = 162324750 OR /*Economics*/
            fieldofstudyid = 185592680 OR /*Chemistry*/
            fieldofstudyid = 33923547 OR /*Math*/
            fieldofstudyid = 86803240 OR /*Biology*/
            fieldofstudyid = 15744967 OR /*Psychology*/
            fieldofstudyid = 144133560 OR /*Business*/
            fieldofstudyid = 71924100 OR /*Medicine*/
            fieldofstudyid = 192562407 OR /*Materials Science*/
            fieldofstudyid = 127313418 OR /*Geology*/
            fieldofstudyid = 39432304 OR /*Environmental Science*/
            fieldofstudyid = 41008148 /*Computer Science*/
            ) a

LEFT JOIN "mag_data"."paperfieldsofstudy" b
ON b.fieldofstudyid = a.childfieldofstudyid

LEFT JOIN "mag_data"."papers" c
ON c.paperid = b.paperid

LEFT JOIN "mag_data"."paperreferences" d
ON d.paperid = b.paperid

/*Chunk of SQL to remove self-ites*/
LEFT JOIN (Select *
                  From
                     (SELECT af.paperid, 
                             array_agg(DISTINCT cf.grid) as CitingGRID 
                       FROM "mag_data"."paperauthoraffiliations" af
                       LEFT JOIN "mag_data"."affiliations" bf
                       ON af.affiliationid = bf.affiliationid 
                       LEFT JOIN "mag_data"."grid" cf
                       ON bf.gridid = cf.gridid
                       WHERE bf.gridid<>''
                       GROUP BY af.paperid)) g
ON g.paperid = d.paperreferenceid
/*Chunk of SQL to remove self-ites*/
           
LEFT JOIN "mag_data"."papers" e
ON d.paperreferenceid = e.paperid

LEFT JOIN (SELECT af.paperid, cf.country, cf.grid  
           FROM "mag_data"."paperauthoraffiliations" af
           LEFT JOIN "mag_data"."affiliations" bf
           ON af.affiliationid = bf.affiliationid 
           LEFT JOIN "mag_data"."grid" cf
           ON bf.gridid = cf.gridid
           WHERE bf.gridid<>'') f
ON f.paperid = b.paperid

WHERE f.country IS NOT NULL AND
      f.GRID IS NOT NULL AND
      e.full_year IS NOT NULL AND
      c.full_year>=1980 AND 
      cardinality(array_intersect(ARRAY[f.grid], g.CitingGRID))<>1 AND 
      (e.full_year - c.full_year)>=0

GROUP BY a.fieldofstudyid,
       a.childfieldofstudyid, 
       f.country,
       f.GRID,
       c.full_year,
       e.full_year
       
       Order by a.fieldofstudyid,
       a.childfieldofstudyid, 
       f.country,
       f.GRID,
       c.full_year,
       e.full_year
'''

# INPUT_Global_Ecology_of_Citations_Number_of_Docs_per_Field
Field_Number_of_Documents_query = '''
SELECT a.fieldofstudyid as Field,
       c.normalizedname as Field_Name,
       b.full_year as Year,
       count(*) as Number_of_Documents

FROM (SELECT paperid, fieldofstudyid 
      FROM "mag_data"."paperfieldsofstudy"
      WHERE fieldofstudyid = 121332964 OR /*Physics*/
            fieldofstudyid = 162324750 OR /*Economics*/
            fieldofstudyid = 185592680 OR /*Chemistry*/
            fieldofstudyid = 33923547 OR /*Math*/
            fieldofstudyid = 86803240 OR /*Biology*/
            fieldofstudyid = 15744967 OR /*Psychology*/
            fieldofstudyid = 144133560 OR /*Business*/
            fieldofstudyid = 71924100 OR /*Medicine*/
            fieldofstudyid = 192562407 OR /*Materials Science*/
            fieldofstudyid = 127313418 OR /*Geology*/
            fieldofstudyid = 39432304 OR /*Environmental Science*/
            fieldofstudyid = 41008148 /*Computer Science*/
            ) a

LEFT JOIN "mag_data"."papers" b
ON a.paperid = b.paperid

LEFT JOIN (
SELECT "fieldofstudyid","normalizedname" FROM "mag_data"."fieldsofstudy" 
) c
ON a.fieldofstudyid = c.fieldofstudyid

WHERE b.full_year>=1980  

GROUP BY a.fieldofstudyid,
       c.normalizedname,
       b.full_year
'''

# INPUT_Global_Ecology_of_Citations_Field_Names
field_name_query = '''
SELECT fieldofstudyid, normalizedname FROM "mag_data"."fieldsofstudy" 
'''

###############################
### Read in Files
###############################


df_SQL_Ecology_Edgelist = pd.read_csv("/Path/Data/INPUT_Global_Ecology_of_Citations_GRID_Edgelist.csv.gz")
df_Number_of_Documents_Year = pd.read_csv("/Path/Data/INPUT_Global_Ecology_of_Citations_Number_of_Docs_per_Field.csv.gz")
df_Field_Names = pd.read_csv("/Path/Data/INPUT_Global_Ecology_of_Citations_Field_Names.csv.gz")

# Note - First download CSVs from Athena Query and then Rename
# Next Read in the Files as CSVs and then Zip and Re-Output Them.

# df_SQL_Ecology_Edgelist = pd.read_csv("INPUT_Global_Ecology_of_Citations_GRID_Edgelist.csv")
# df_Number_of_Documents_Year = pd.read_csv("INPUT_Global_Ecology_of_Citations_Number_of_Docs_per_Field.csv")
# df_Field_Names = pd.read_csv("INPUT_Global_Ecology_of_Citations_Field_Names.csv")

# df_SQL_Ecology_Edgelist.to_csv("INPUT_Global_Ecology_of_Citations_GRID_Edgelist.csv.gz",index=False,compression='gzip')
# df_Number_of_Documents_Year.to_csv("INPUT_Global_Ecology_of_Citations_Number_of_Docs_per_Field.csv.gz",index=False,compression='gzip')
# df_Field_Names.to_csv("INPUT_Global_Ecology_of_Citations_Field_Names.csv.gz",index=False,compression='gzip')

###############################
### Combine Files
###############################

### Number of Documents
df_Ecology_Edgelist = pd.merge(df_SQL_Ecology_Edgelist,df_Number_of_Documents_Year.drop(columns="Field_Name"),on=["Field","Year"],how="left").rename(columns={"Number_of_Documents":"Year_Number_of_Documents"})
df_Ecology_Edgelist = pd.merge(df_Ecology_Edgelist,df_Number_of_Documents_Year.drop(columns="Field_Name"),left_on=["Field","Year_That_Is_Citing"],right_on=["Field","Year"],how="left").rename(columns={"Number_of_Documents":"Year_That_Is_Citing_Number_of_Documents"})
df_Ecology_Edgelist = df_Ecology_Edgelist.drop(columns=["Year_y"]).rename(columns={"Year_x":"Year"})


df_Ecology_Edgelist = pd.merge(df_Ecology_Edgelist,df_Field_Names,left_on=["Field"],right_on=["fieldofstudyid"],how="left").rename(columns={"normalizedname":"Field_Name"}).drop(columns=["fieldofstudyid","Field"])
df_Ecology_Edgelist = pd.merge(df_Ecology_Edgelist,df_Field_Names,left_on=["SubField"],right_on=["fieldofstudyid"],how="left").rename(columns={"normalizedname":"SubField_Name"}).drop(columns=["fieldofstudyid","SubField"])

df_Ecology_Edgelist = df_Ecology_Edgelist.rename(columns={"Field_Name":"Field","SubField_Name":"SubField"})

df_Ecology_Edgelist.columns = [x.upper() for x in df_Ecology_Edgelist.columns]

df_Ecology_Edgelist["FIELD"] = df_Ecology_Edgelist["FIELD"].str.title()
df_Ecology_Edgelist["SUBFIELD"] = df_Ecology_Edgelist["SUBFIELD"].str.title()


df_Ecology_Edgelist.to_csv(Ecology_Edgelist_Filename,index=False,compression='gzip')



